import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline

# Seed NumPy's global random generator so the random draws made later in
# this script are repeatable from run to run.
np.random.seed(42)

# Load the classroom activity log from CSV and preview its first rows.
df = pd.read_csv('classroom_actions.csv')
df.head()

# total_days is the total amount of time each student has spent in the
# classroom.  Keep a single row (the first one) per student id so that every
# student contributes exactly once to the group averages below.
df.drop_duplicates(subset='id', keep='first', inplace=True)

# average classroom time for the control group
control_mean = df.loc[df['group'] == 'control', 'total_days'].mean()

# average classroom time for the experiment group
experiment_mean = df.loc[df['group'] == 'experiment', 'total_days'].mean()

# show both averages: control first, experiment second
control_mean, experiment_mean

(73.368990384615387, 74.671593533487297)

# compute observed difference in classroom time
# (experiment minus control, so a positive value means the experiment
# group averaged more classroom time than the control group)
obs_diff = experiment_mean - control_mean

# display observed difference
obs_diff

1.3026031488719099

# Build the sampling distribution of the difference in average classroom
# time (experiment minus control) with bootstrapping: resample the whole
# data set with replacement 10000 times and record the difference each time.
n_rows = len(df)
diffs = np.empty(10000)
for i in range(10000):
    boot = df.sample(n_rows, replace=True)
    boot_control_mean = boot.loc[boot['group'] == 'control', 'total_days'].mean()
    boot_experiment_mean = boot.loc[boot['group'] == 'experiment', 'total_days'].mean()
    diffs[i] = boot_experiment_mean - boot_control_mean

# histogram of the bootstrapped differences
plt.hist(diffs)

(array([    5.,    42.,   332.,  1241.,  2587.,  2966.,  1887.,   759.,
          159.,    22.]),
 array([ -1.71018109e+00,  -1.14129540e+00,  -5.72409700e-01,
         -3.52400341e-03,   5.65361693e-01,   1.13424739e+00,
          1.70313309e+00,   2.27201878e+00,   2.84090448e+00,
          3.40979018e+00,   3.97867587e+00]),
 <a list of 10 Patch objects>)

# Simulate the distribution of the difference under the null hypothesis:
# centred on 0 (no difference between groups), with the same spread as the
# bootstrapped sampling distribution and the same number of draws.
null_vals = np.random.normal(0, diffs.std(), diffs.size)

# plot null distribution
plt.hist(null_vals)

# Mark the observed statistic.  This must be obs_diff (the difference
# measured on the actual data), not the mean of the bootstrap replicates.
plt.axvline(obs_diff, color='r')

<matplotlib.lines.Line2D at 0x7fce88fa3cf8>

# Compute the one-sided p value: the share of null draws that are larger
# than the observed difference.  The comparison uses obs_diff, the statistic
# measured on the actual data, rather than the bootstrap mean.
(null_vals > obs_diff).mean()

0.039600000000000003

	timestamp	id	group	total_days	completed
0	2015-08-10 17:06:01.032740	610019	experiment	97	True
1	2015-08-10 17:15:28.950975	690224	control	75	False
2	2015-08-10 17:34:40.920384	564994	experiment	128	True
3	2015-08-10 17:50:39.847374	849588	experiment	66	False
4	2015-08-10 19:10:40.650599	849826	experiment	34	False